TODO shorten lecture grp levels
nusmods at https://api.nusmods.com/..JSON format, convert to a dataframe.myBid <- data.frame() # create empty dataframe which will act as a container to be populated with data
for(year in c(2011:2018)) # looping through each year
{
for(semester in c(1,2)) # loopinig through semesters
{
if(year == 2017 & semester == 2) # there is no cors biding data for 2017/2018 sem 2
{
} else if(year == 2018 & semester == 2) # there is no cors biding data for 2018/2019 sem 2
{
} else
{
# create the url where data is to be extracted from
myurl <- paste0("https://api.nusmods.com/", year, "-", year + 1, "/", semester, "/corsBiddingStatsRaw.json")
myjson <- fromJSON(file = url(myurl))
for(r in 1:length(myjson)) # for each element in the myjson list, append it to myModInfo
{
if(isTRUE(str_detect(myjson[[r]][["ModuleCode"]], "^PL"))) # only keep info if module code begins with PL
{
if(myjson[[r]][["Semester"]] == "1" | myjson[[r]][["Semester"]] == "2") # only get semester 1 and 2 information
{
myBid <- rbind(myBid, myjson[[r]]) # add to dataframe
}
}
myjson[[r]] <- NA # replace the element with NA to free up some rAM
}
}
cat(year, "Semester", semester, "Done!") # progress tracker
}
}
saveRDS(myBid, file = "myBid.RDS") # save to directorymyBid.RDSmyBid.RDS and load it directly from my local folder while I worked on the project.myModInfo <- data.frame() # create empty dataframe which will act as a container to be populated with data
for(year in c(2011:2018)) # looping through each year
{
for(semester in c(1,2))
{
# create the url where data is to be extracted from
myurl <- paste0("https://api.nusmods.com/", year, "-", year + 1, "/", semester, "/moduleTimetableDeltaRaw.json")
myjson <- fromJSON(file = url(myurl))
for(r in 1:length(myjson)) # for each element in the myjson list, append it to myModInfo
{
if(isTRUE(str_detect(myjson[[r]]$ModuleCode, "^PL"))) # only keep info if module code begins with PL
{
if(myjson[[r]]$Semester == 1 | myjson[[r]]$Semester == 2) # only get semester 1 and 2 information
{
myModInfo <- rbind(myModInfo, myjson[[r]]) # add to dataframe
}
}
myjson[[r]] <- NA # replace the element with NA to free up some rAM
}
cat(year, "Semester", semester, "Done!") # progress tracker
}
}
myTitles <- data.frame() # create empty dataframe which will act as a container to be populated with data
for(year in c(2014:2018)) # looping through each year
{
myurl <- paste0("https://api.nusmods.com/", year, "-", year + 1, "/moduleList.json") # create the url where data is to be extracted from
myjson <- fromJSON(file = url(myurl))
for(r in 1:length(myjson)) # for each element in the myjson list, append it to myModInfo
{
if(isTRUE(str_detect(myjson[[r]]$ModuleCode, "^PL"))) # only keep info if module code begins with PL
{
if(paste0(myjson[[r]]$Semester, collapse = "|") == "1"|
paste0(myjson[[r]]$Semester, collapse = "|") == "2"|
paste0(myjson[[r]]$Semester, collapse = "|") == "1|2") # only keep information from semester 1 and 2
{
myTitles <- rbind(myTitles, as.data.frame(myjson[[r]])) # add to dataframe
}
}
myjson[[r]] <- NA # free RAM
}
}
myModInfo <- myTitles %>% # add titles information to myModInfo
select(ModuleCode, ModuleTitle) %>% # select these two columns
filter(ModuleTitle != "Lab in Applied Psychology") %>%
distinct() %>% # remove duplicates
right_join(myModInfo, by = "ModuleCode") # left = myTitles, right = myModInfo
saveRDS(myModInfo, file = "myModInfo.RDS") # save to directorymyModInfo.RDSmyModInfo.RDS and load the data directly while I worked on the project.myModInfo.
myModInfo <- myModInfo %>%
select(-LastModified, -LastModified_js, -isDelete) %>% # remove these columns
filter(str_detect(ModuleCode, "^PL")) %>% # removing non-Psychology modules
filter(!is.na(ModuleTitle)) %>% # removing modules without module titles #PL3285, PL4220, PL4217
filter(LessonType != "TUTORIAL") %>% # removing information about tutorials
select(AcadYear, Semester, ModuleCode, ModuleTitle, DayText, StartTime, Semester, ClassNo) %>%# select these columns
distinct(AcadYear, Semester, ModuleCode, ClassNo, DayText, StartTime, .keep_all = TRUE) # remove duplicates
modrow <- nrow(myModInfo) # get number of rows of myMoInfo
myModInfo <- myModInfo %>%
mutate(rowindex = 1:modrow) %>% # create new row that is the row number
arrange(-rowindex) %>% # invert the dataframe, make it upside down, reason: latest entry are appended to the bottom of the dataframe!
distinct(AcadYear, Semester, ModuleCode, ClassNo, .keep_all = TRUE) %>% # remove duplicates based on these columns
select(-rowindex) # remove rowindex
tail(myModInfo) # peekmyBid.
myModInfo.myBid <- myBid %>%
filter(str_detect(ModuleCode, "^PL")) %>% # removing non-Psychology modules
filter(!str_detect(ModuleCode, "PLS|PLB")) %>% # remove PLS and PLB modules
filter(!str_detect(StudentAcctType, "Reserved")) %>% # remove reserved rounds
filter(!str_detect(StudentAcctType, "[G]")) %>% # remove bidding information from non-psychology students
select(-Faculty) %>% # remove this columns
mutate(Group1 = gsub("-", "", Group)) %>% # remove hyphens such that it works with parse_number()
mutate(ClassNo = as.character(parse_number(Group1))) # new column signifying which lecture slots for modules with >1 lecture slots
head(myBid) # peekmyModInfo and myBid.# modules that do not appear in both dataframes are dropped
mydata <- inner_join(myBid,
myModInfo,
by = c("ModuleCode", "AcadYear", "Semester", "ClassNo"))
head(mydata) # peek# transform these columns to numeric
for(r in c("Quota", "Bidders", "LowestBid", "LowestSuccessfulBid", "HighestBid", "StartTime"))
{
mydata[,grep(r, names(mydata))] <- as.numeric(mydata[,grep(r, names(mydata))])
}
# transform these columns to factors
for(r in c("AcadYear", "Semester", "ModuleCode", "Round", "StudentAcctType", "DayText", "StudentAcctType", "ModuleTitle", "Group$", "ClassNo"))
{
mydata[,grep(r, names(mydata))] <- factor(mydata[,grep(r, names(mydata))])
}DayText LevelsStudentAcctType LevelsGroup Levels# create new variable that indicates the level of the module, based on their module code
mydata$Level <- factor(ifelse(str_detect(mydata$ModuleCode, "1[0-9][0-9][0-9]"), "Level 1",
ifelse(str_detect(mydata$ModuleCode, "2[0-9][0-9][0-9]"), "Level 2",
ifelse(str_detect(mydata$ModuleCode, "3[0-9][0-9][0-9]"), "Level 3",
ifelse(str_detect(mydata$ModuleCode, "4[0-9][0-9][0-9]"), "Level 4",
"Graduate Module")))))# create vector of the column names which are factors
facnames <- mydata %>% select_if(is.factor) %>% names()
# facnames without ModuleCode and StudentAcctType
facnames.mod <- facnames[-grep("ModuleCode|ModuleTitle", facnames)]
# create vector ofthe column names which are numeric
numnames <- mydata %>% select_if(is.numeric) %>% names()
# numnames without StartTime
numnames.time <- names(select_if(mydata, is.numeric))[-grep("StartTime", numnames)]Bidders is calculated across all academic years, all bidding rounds, all modules…## 'data.frame': 1934 obs. of 20 variables:
## $ AcadYear : Factor w/ 8 levels "2011/2012","2012/2013",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Semester : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
## $ Round : Factor w/ 7 levels "1A","1B","1C",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ModuleCode : Factor w/ 87 levels "PL1101E","PL2131",..: 1 1 2 2 3 3 4 4 5 5 ...
## $ Group : Factor w/ 4 levels "LEC1","LEC2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Quota : num 95 430 5 12 35 35 28 50 25 22 ...
## $ Bidders : num 10 100 3 42 8 3 7 2 8 5 ...
## $ LowestBid : num 1 1 1 205 1 1 1 1 1 1 ...
## $ LowestSuccessfulBid: num 1 1 1 977 1 1 1 1 1 1 ...
## $ HighestBid : num 500 1150 368 1255 500 ...
## $ StudentAcctType : Factor w/ 4 levels "New[P]","NUS[P]",..: 3 1 3 1 3 1 3 1 3 1 ...
## $ Group1 : chr "LECTURE 1" "LECTURE 1" "LECTURE 1" "LECTURE 1" ...
## $ ClassNo : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ ModuleTitle : Factor w/ 85 levels "Abnormal Psychology",..: 34 34 74 74 75 75 8 8 13 13 ...
## $ DayText : Factor w/ 5 levels "Monday","Tuesday",..: 1 1 3 3 2 2 2 2 3 3 ...
## $ StartTime : num 1800 1800 1600 1600 800 800 1200 1200 1400 1400 ...
## $ Level : Factor w/ 4 levels "Level 1","Level 2",..: 1 1 2 2 2 2 3 3 3 3 ...
## $ BidPerQuota : num 0.105 0.233 0.6 3.5 0.229 ...
## $ Period : Factor w/ 2 levels "Morning",">=Afternoon": 2 2 2 2 1 1 2 2 2 2 ...
## $ Category : chr "Core" "Core" "Core" "Core" ...
## Warning in describe(mydata): NAs introduced by coercion
## Warning in describe(mydata): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
# plot the categorical variables
for(r in facnames.mod)
{
plot(
ggplot(data = mydata, aes_string(x = r, fill = r)) +
geom_histogram(stat = "count") +
ylab("Count") +
ggtitle(paste0("Count of ", r)) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90),
axis.title.x = element_blank(),
legend.position = "none")
)
}# plot the continuous variables
for(r in numnames)
{
plot(
ggplot(data = mydata, aes_string(x = r, fill = r)) +
geom_histogram(fill = "violetred", alpha = 0.5, bins = 50) +
ylab("Histogram") +
ggtitle(paste0(r)) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90),
axis.title.x = element_text())
)
}for(r in 1:length(facnames.mod)) # loop across all factors
{
for(i in 1:length(facnames.mod)) # inner loop
{
if(i == r | i < r)
{ # dont do anything if they are the same or the graph has been made before
} else {
tempform <- paste0("~ ", facnames.mod[r], " + ", facnames.mod[i]) # create formula for xtabs
# temp is a dataframe that is only going to exist in this section and overwritten with each loop
temp <- as.data.frame(xtabs(eval(parse(text = tempform)),
data = mydata,
subset = NULL))
plot(
ggplot(data = temp, aes_string(x = facnames.mod[r], y = facnames.mod[i], fill = "Freq", label = "Freq")) +
geom_tile() +
geom_text() +
scale_fill_gradient(low = "white", high = "violetred") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90),
legend.position = "none")
)
}
}
}for(r in 1:length(numnames)) # loop across all numeric columns
{
for(i in 1:length(numnames)) # inner loop
{
if(i == r | i < r)
{ # dont do anything if they are the same or the graph has been made before
} else {
# create formulas for lm()
tempform.std <- paste0("scale(", numnames[i],")", " ~ ", "scale(", numnames[r], ")") # standardized
tempform <- paste0(numnames[i], " ~ ", numnames[r]) # unstandardized
# regress to get best fit line
stdreg <- lm(eval(parse(text = tempform.std)),
data = mydata) # standardized
reg <- lm(eval(parse(text = tempform)),
data = mydata) # unstandardized
plot(
ggplot(data = mydata, aes_string(x = numnames[r], y = numnames[i])) +
geom_point(color = "violetred", size = 2, alpha = 0.3) +
theme_classic() +
geom_abline(slope = reg$coefficients[2], intercept = reg$coefficients[1], lty = "dashed") +
geom_label(aes(x = Inf, y = Inf, label = paste0("Standardized Regression Coefficient = ",
round(stdreg$coefficients[2],3)),
hjust = 1, vjust = 2)) +
theme(axis.text.x = element_text(angle = 90))
)
}
}
}for(r in facnames.mod) # loop across all factor columns
{
for(i in numnames) # inner loop across all numeric columns
{
plot(
ggplot(data = mydata, aes_string(x = r, y = i, fill = r)) +
geom_boxplot() +
theme_classic() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 90))
)
}
} mydata %>%
filter(Level == "Level 4") %>%
filter(Round == "1A") %>%
filter(Quota >= 40) %>%
filter(BidPerQuota > 1) %>%
arrange(-LowestSuccessfulBid) %>%
ggplot(mapping = aes(y = ModuleCode, x = AcadYear, fill = LowestSuccessfulBid, label = ModuleTitle)) +
geom_tile(show.legend = TRUE) +
theme_classic() +
facet_wrap(~ Semester, ncol = 2) +
theme(axis.text.x = element_text(angle = 90),
legend.position = "top",
strip.background = element_rect(fill = "grey30", linetype = "blank"),
strip.text = element_text(color = "white", size = 12)) +
scale_fill_gradient(low = "grey90", high = "red")ggplotly(
mydata %>%
filter(Level == "Level 4") %>%
filter(Round == "1A") %>%
group_by(ModuleCode, ModuleTitle) %>%
summarize(LSB.avg.sem.years = mean(LowestSuccessfulBid)) %>%
ungroup() %>%
mutate(ModuleCode = fct_reorder(ModuleCode, LSB.avg.sem.years)) %>%
ggplot(mapping = aes(x = ModuleCode, y = LSB.avg.sem.years, label = ModuleTitle, fill = LSB.avg.sem.years)) +
geom_bar(stat = "identity") +
coord_flip() +
theme_classic() +
theme(legend.position = "none")
, tooltip = c("x", "label", "y")
, height = 600, width = 400
)ggplotly(
mydata %>%
filter(Level == "Level 4") %>%
filter(Round == "1A") %>%
group_by(ModuleCode, ModuleTitle) %>%
summarize(BPQ.avg.sem.years = mean(BidPerQuota)) %>%
ungroup() %>%
mutate(ModuleCode = fct_reorder(ModuleCode, BPQ.avg.sem.years)) %>%
ggplot(mapping = aes(x = ModuleCode, y = BPQ.avg.sem.years, label = ModuleTitle, fill = BPQ.avg.sem.years)) +
geom_bar(stat = "identity") +
coord_flip() +
theme_classic() +
theme(legend.position = "none")
, tooltip = c("x", "label", "y")
, height = 600, width = 400
)ggplotly(
mydata %>%
filter(Level == "Level 4") %>%
filter(Round == "1A") %>%
group_by(ModuleCode, ModuleTitle) %>%
summarize(quota.avg.sem.years = mean(Quota)) %>%
ungroup() %>%
mutate(ModuleCode = fct_reorder(ModuleCode, quota.avg.sem.years)) %>%
ggplot(mapping = aes(x = ModuleCode, y = quota.avg.sem.years, label = ModuleTitle, fill = quota.avg.sem.years)) +
geom_bar(stat = "identity") +
coord_flip() +
theme_classic() +
theme(legend.position = "none")
, tooltip = c("x", "label", "y")
, height = 600, width = 400
)Lets look at each module and compare the average number of bidders, bidders per quota and lowest successful bids when the lecture begins in and after the morning.
for(r in c("meanBidders", "meanBpQ", "meanLSB"))
{
plot(mydata %>%
group_by(ModuleCode, ModuleTitle, Period) %>%
summarise(meanBidders = mean(Bidders), meanBpQ = mean(BidPerQuota), meanLSB = mean(LowestSuccessfulBid),
sdBidders = sd(Bidders), sdBpQ = sd(BidPerQuota), sdLSB = mean(LowestSuccessfulBid)) %>%
ggplot(aes_string(x = "Period", y = r, fill = "Period")) +
geom_bar(stat = "identity") +
theme_classic() +
theme(axis.text.x = element_blank(),
legend.position = "top",
strip.background = element_rect(fill = "grey30", linetype = "blank"),
strip.text = element_text(color = "white", size = 12)) +
facet_grid(~ ModuleCode:ModuleTitle, labeller = label_wrap_gen(width = 25)) +
ggtitle(r))
}for(r in c("meanBidders", "meanBpQ", "meanLSB"))
{
plot(mydata %>%
group_by(Level, Period) %>%
summarise(meanBidders = mean(Bidders), meanBpQ = mean(BidPerQuota), meanLSB = mean(LowestSuccessfulBid),
sdBidders = sd(Bidders), sdBpQ = sd(BidPerQuota), sdLSB = mean(LowestSuccessfulBid)) %>%
ggplot(aes_string(x = "Period", y = r, fill = "Period")) +
geom_bar(stat = "identity") +
theme_classic() +
theme(axis.text.x = element_blank(),
legend.position = "top",
strip.background = element_rect(fill = "grey30", color = "black"),
strip.text = element_text(color = "white", size = 12)) +
facet_wrap(~ Level) +
ggtitle(r))
}StartTimeHrs12 And Group-Mean Centered Hrs12.
Inadvertently, we began to observe, hypothesize and act on certain trends to guide and maximize our bidding choices. We might even share advices based on these trends. For example:
Modules belonging to the domain of Clinical Psychology are the most popular, so you need to plan ahead and stockpile points from previous semesters if you plan on bidding for them.
This advice came from numerous seniors and I even shared it with my juniors. It became more convincing after observing peers grief over their inability to secure a place in Introduction to Counselling Psychology or Psychological Therapies due to the exhorbitant amount of points required (which required students to stockpile points from previous semesters). But I have never heard anyone claiming that they really wanted to study Cognitive Neuroscience but failed to bid for it.
But was it true that Clinical Psychology modules were the most popular? Rather than inferring trends from personal anecdotes and observation, do we have data to support this claim? The answer is yes! Bidding statistics and other module information are available at https://nusmods.com/api/. All thanks to the team at NUSMods who created a great timetabling tool for all NUS students.
The information was downloaded, extracted, transform, analysis and visualized using R. The codes are available under Codes tab above. The API contains extracted data for all modules from different majors and faculty but I will focus only on Psychology modules in this post as I have greater familiarity with them.
For the typical Psychology major, there are broad four categories of modules.
| Categories | Description |
|---|---|
| Core | Modules that are required for all undergraduates. Include PL1101E, PL2131, PL2132, PL3232 to PL3236. |
| Level 3 Electives | Modules that are outside of the core modules. Between four to six of these are required by all undergraduates to graduate. Their module codes run from PL3237 to PL3260. |
| Level 3 Lab Modules | Lab modules are structured as individual or group research projects in a specific domain of Psychology. Every undergraduate is required to complete at least one of these modules. Their module codes are prefixed with PL328x. |
| Level 4 Honors Modules | Modules that are required to graduate on the Honors track, usually taken near the end of the undergraduate degree. Between three to eight of these are required to graduate. They are prefixed with PL4xxx. |
Within each category, were Clinical Psychology modules most popular?
Luckily, the bidding data contains potential indicators of popularity. These are the key bidding statistics/variables which will be used to compare popularity:
The bar graphs below illustrates the mean Quota, Bidders, BpQ and LSB of each module category, calculated across all modules, semesters and rounds. The different categories vary greatly, and these differences makes it difficult to meaningfully compare popularity across categories.
We define a popular module possessing the following characteristic in Round 1A (the first round of bidding):
Modules that do not fit criteria 1. and 2. will not be considered popular. Amongst these modules, 3. and 4. will be used to determine which modules were most popular.
The bar plot displays the mean LSB of level 4 modules in Round 1A, averaged across all academic year, semesters, lecture slots (for modules with multiple lecture slots) and account types. Only modules with a median available Quota of 40 and above (1.) and BpQ more than 1 (2.) are displayed. Hover over the respective bars to view other statistics such as the mean/median number of Bidders, Quota, BpQ and LSB.
I selected three more trends to investigate.
There used to be a time when Clinical Psychology modules were not popular.
I mentioned that there was a perceived trend that Clinical Psychology modules were really popular, but I recently met a few slightly more senior seniors in the workforce (who graduated almost a decade ago). They told me, much to my surprise, that Clinical Psychology was pretty unpopular back in their days. Was this supported by the data?
A module is less popular when the lecture is early in the day.
Lessons at 8am were painful, I live somewhere in the North-East so I would have to wake up somewhere around 6.15am to arrive comfortably for 8am lessons (including time taken for the shuttle bus in NUS). I generalized from my personal opinion to form this myth, that 8am lessons would be less popular compared to their later counterparts. My friends generally shared the same sentiments, but was it reflected in the bidding statistics? Or are we projecting our laziness to other students?
Module bidding became more competitive in later cohorts, you should bid some amount higher than previous winning bids to account for the inflation in points caused by the increased competitiveness.
Almost everyone checks the bidding statistics from previous iterations of the module to estimate the amount of points to bid. Of course, there should be a positive correlation between past and future bidding statistics but would it be recommended to anchor your bids onto the lowest successful bids from the past? If so, how much higher or lower should we bid compared to previous iterations? There are a few hypothesized reasons for the increased competitiveness, such as the cohort expanding but module quotas remanining constant. Or the tendency for students to always bid some points higher than previous winning bids, which leads to this upward cycle of bid point inflation. Does bid point inflation really exist?